TFLearn [Participle Phrase] Fragment Detection

This notebook is based on the original fragment detection notebook, but is specific to the detection of participle phrase fragments. As our training data we will use a datafile of 2,651 sentences, each containing a participle phrase at the beginning, middle, or end of the sentence, and 2,651 participle phrases extracted from those sentences -- these raw participle phrases will always be fragments. The labels will be either 1 or 0, where 1 indicates a participle phrase fragment and 0 indicates that it is NOT a participle phrase fragment.

Import Dependencies



In [ ]:

    
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
nlp = spacy.load('en')
import re
from nltk.util import ngrams, trigrams
import csv

Create combined data



In [ ]:

    
import subprocess

# Run combine.py and block until it has finished. The previous Popen call
# returned immediately, so the next cell could start reading the combined
# file before it was (fully) written. check_call also raises
# CalledProcessError if the script exits with a non-zero status instead of
# failing silently.
# NOTE(review): presumably this script writes childrens_fragments.combined.txt,
# which the "Load Datafiles" cell reads -- confirm against combine.py.
subprocess.check_call(
    ["python", "combine.py", "childrens_fragments"],
    cwd='../data/fragments/participle-phrases',
)

Load Datafiles



In [ ]:

    
# Read the combined file one entry per line, stripping surrounding whitespace.
with open("../data/fragments/participle-phrases/childrens_fragments.combined.txt","r") as f:
    texts = [raw_line.strip() for raw_line in f]

# Labels follow line parity: even-numbered lines (0, 2, ...) get 0 and
# odd-numbered lines get 1.
# NOTE(review): this assumes the file strictly alternates full sentence /
# extracted fragment -- confirm against combine.py.
labels = [position % 2 for position in range(len(texts))]

print(texts[-10:])

Shuffle the data



In [ ]:

    
import random

# Fixed seed so the shuffle -- and therefore the train/test split made further
# down -- is the same after every Restart & Run All.
SHUFFLE_SEED = 42

# Shuffle texts and labels together so each text keeps its own label.
combined = list(zip(texts, labels))
random.Random(SHUFFLE_SEED).shuffle(combined)

# Slice-assignment writes the shuffled order back into the existing lists.
# Note: re-running this cell alone shuffles the already-shuffled lists again.
texts[:], labels[:] = zip(*combined)
print(texts[-10:])
print(labels[-10:])

Get parts of speech for text string



In [ ]:

    
def textStringToPOSArray(text):
    """Return the spaCy `tag_` value of every token in `text`, in order.

    The text is parsed with the module-level `nlp` pipeline.
    """
    return [token.tag_ for token in nlp(text)]

textStringToPOSArray(texts[3])

Get POS trigrams for a text string



In [ ]:

    
def find_ngrams(input_list, n):
    """Return an iterator over the consecutive n-grams (as tuples) of `input_list`.

    Builds `n` copies of the sequence, each shifted one position further, and
    zips them together; zip stops at the shortest copy, so only complete
    n-grams are produced.
    """
    shifted_views = []
    for offset in range(n):
        shifted_views.append(input_list[offset:])
    return zip(*shifted_views)

def getPOSTrigramsForTextString(text):
    """Return the list of POS-tag trigrams for `text`.

    Tags come from `textStringToPOSArray`; the trigrams are produced by
    nltk's `trigrams` helper and materialised into a list.
    """
    return list(trigrams(textStringToPOSArray(text)))

# Show one sample with its label, followed by its POS trigrams.
print("Text: ", texts[3], labels[3])
getPOSTrigramsForTextString(texts[3])

Turn Trigrams into Dict keys



In [ ]:

    
def trigramsToDictKeys(trigrams):
    """Turn each trigram (an iterable of strings) into a single 'A>B>C' key.

    Returns the keys as a list, in the same order as the input.
    """
    return ['>'.join(parts) for parts in trigrams]

# Show a sample text on one line and its trigram keys on the next.
print(texts[2], trigramsToDictKeys(getPOSTrigramsForTextString(texts[2])), sep='\n')



In [ ]:

    
from collections import Counter

# Count how often each POS-trigram key occurs across the whole data set.
total_counts = Counter()
for textString in texts:
    total_counts.update(trigramsToDictKeys(getPOSTrigramsForTextString(textString)))

# Short alias kept so anything that still refers to `c` keeps working.
c = total_counts

# len() of a Counter is the number of distinct keys, i.e. distinct POS
# trigrams -- the old label ("Total words") described a different quantity.
print("Distinct POS trigrams in data set: ", len(total_counts))



In [ ]:

    
# Vocabulary = every trigram key, most frequent first. most_common() with no
# argument returns all (key, count) pairs in descending count order.
vocab = [trigram_key for trigram_key, _count in total_counts.most_common()]
print(vocab[:60])



In [ ]:

    
# vocab is sorted by descending count, so its last entry is one of the
# least frequent trigrams; print it together with its count.
print(vocab[-1], ': ', total_counts[vocab[-1]])

Take the trigrams and index them



In [ ]:

    
# Map every trigram key to its position in `vocab`; that position is the
# column used for the trigram in the feature vectors.
word2idx = dict(zip(vocab, range(len(vocab))))
print(word2idx)



In [ ]:

    
def textToTrigrams(text):
    """Return the 'A>B>C' POS-trigram keys for `text`."""
    pos_trigrams = getPOSTrigramsForTextString(text)
    return trigramsToDictKeys(pos_trigrams)

def text_to_vector(text):
    """Encode `text` as a vector of POS-trigram counts.

    Returns a NumPy array of length `len(vocab)`. Position `word2idx[key]`
    holds how many times trigram `key` occurs in `text`. Trigrams that are
    not in `word2idx` are ignored.
    """
    wordVector = np.zeros(len(vocab))
    for word in textToTrigrams(text):
        index = word2idx.get(word)
        # Identity check against None: index 0 is a valid column and must
        # not be mistaken for "missing".
        if index is not None:
            wordVector[index] += 1
    return wordVector



In [ ]:

    
# Spot-check the encoder: show the first 65 entries of the count vector
# for a hand-written sentence.
text_to_vector('Donald, standing on the precipice, began to dance.')[:65]



In [ ]:

    
# One row per text, one column per vocabulary trigram. The matrix has an
# integer dtype, so the counts returned by text_to_vector are cast on assignment.
word_vectors = np.zeros((len(texts), len(vocab)), dtype=np.int_)
for row_index in range(len(texts)):
    word_vectors[row_index] = text_to_vector(texts[row_index])



In [ ]:

    
# Show the first 5 word vectors, limited to their first 23 columns
word_vectors[:5, :23]

Chunking the data for TF



In [ ]:

    
records = len(labels)
train_fraction = 0.9  # share of the (already shuffled) records used for training

# A single cut point keeps the two sets disjoint and covers every record.
# The previous code sliced the test set as [int(records * 0.1):], i.e. the
# last ~90% of the data, so most test rows had also been trained on and the
# reported test accuracy was inflated.
train_split = int(records * train_fraction)   # index of the first test record
test_split = records - train_split            # number of held-out test records
print(train_split, test_split)

trainX, trainY = word_vectors[:train_split], to_categorical(labels[:train_split], 2)
testX, testY = word_vectors[train_split:], to_categorical(labels[train_split:], 2)

# Every record must land in exactly one of the two sets.
assert len(trainX) + len(testX) == records



In [ ]:

    
# Inspect the last training example: its trigram-count vector and its
# 2-class categorical label.
trainX[-1], trainY[-1]



In [ ]:

    
# Sizes of the training and test label arrays, followed by their sum.
len(trainY), len(testY), len(trainY) + len(testY)

Setting up TF



In [ ]:

    
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    #### Your code ####
    net = tflearn.input_data([None, len(vocab)])                          # Input
    net = tflearn.fully_connected(net, 200, activation='ReLU')      # Hidden
    net = tflearn.fully_connected(net, 25, activation='ReLU')      # Hidden
    net = tflearn.fully_connected(net, 2, activation='softmax')   # Output
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    model = tflearn.DNN(net)

    return model



In [ ]:

    
# Number of distinct trigram keys; this is the width of the network's input layer.
len(vocab)

Initialize



In [ ]:

    
# Create a fresh, untrained network (build_model resets the default TF graph first).
model = build_model()

Training



In [ ]:

    
# Training
# Fit on the training split for 50 epochs in mini-batches of 128.
# validation_set=0.1 is passed so that tflearn holds out part of the training
# data for validation.
model.fit(
    trainX,
    trainY,
    validation_set=0.1,
    show_metric=True,
    batch_size=128,
    n_epoch=50,
)



In [ ]:

    
# Testing
# Column 0 of the model output is compared with column 0 of the categorical
# test labels; thresholding the prediction at 0.5 yields a 0/1 decision.
class0_probabilities = np.array(model.predict(testX))[:, 0]
predictions = (class0_probabilities >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:, 0], axis=0)
print("Test accuracy: ", test_accuracy)



In [ ]:

    
# Persist the trigram -> index mapping, one "key,index" row per trigram.
# The context manager makes sure the file is flushed and closed (the bare
# open() used before never closed the handle, so rows could stay buffered).
# newline="" is what the csv module asks for, so that the writer controls
# the line endings itself.
with open("../models/participlevocabindex.csv", "w", newline="") as vocab_index_file:
    w = csv.writer(vocab_index_file)
    for key, val in word2idx.items():
        w.writerow([key, val])



In [ ]:

    
# Save the trained model under ../models.
model.save("../models/participle_model.tfl")

Playground



In [ ]:

    
def test_sentence(sentence):
    """Print the model's verdict on whether `sentence` is a participle phrase fragment.

    The sentence is encoded with `text_to_vector` and passed to `model.predict`.
    Index 1 of the prediction is reported as P(positive), and the verdict is
    'Yes' when it exceeds 0.5.
    """
    class_probabilities = model.predict([text_to_vector(sentence)])[0]
    positive_prob = class_probabilities[1]
    if positive_prob > 0.5:
        verdict = 'Yes'
    else:
        verdict = 'No'
    print('Is this a participle phrase fragment?\n {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), verdict)



In [ ]:

    
# Complete sentence whose subject is an -ing phrase; a good model should answer 'No'.
test_sentence("Neglecting to recognize the horrors those people endure allow people to go to war more easily.")



In [ ]:

    
# Complete sentence with participle phrases in the middle; should be 'No'.
test_sentence("Katherine, gesticulating wildly and dripping in sweat, kissed him on the cheek.")



In [ ]:

    
# Bare participle phrase with no main clause; should be 'Yes'.
test_sentence("Working far into the night in an effort to salvage her little boat.")



In [ ]:

    
# Same phrase as the previous cell, now followed by a main clause; should be 'No'.
test_sentence("Working far into the night in an effort to salvage her little boat, she slowly grew tired.")



In [ ]:

    
# Short bare participle phrase; should be 'Yes'.
test_sentence("Rushing to the rescue with his party.")



In [ ]:

    
# Long complete sentence used as a negative example; should be 'No'.
test_sentence("Isobel was about thirteen now, and as pretty a girl, according to Buzzby, as you could meet with in any part of Britain.")



In [ ]:

    
# Complete sentence with a leading participle phrase; should be 'No'.
test_sentence("Being of a modest and retiring disposition, Mr. Hawthorne avoided publicity.")



In [ ]:

    
# Complete sentence with a leading participle phrase and no final period; should be 'No'.
test_sentence("Clambering to the top of a bridge, he observed a great rainbow")



In [ ]:

    
# The participle phrase from the previous cell on its own; should be 'Yes'.
test_sentence("Clambering to the top of a bridge.")



In [ ]:

    
# The main clause on its own, with no participle phrase; should be 'No'.
test_sentence("He observed a great rainbow.")



In [ ]:

    
# Complete sentence with a leading participle phrase; should be 'No'.
test_sentence("Sitting on the iron throne, Joffry looked rather fat.")



In [ ]:

    
# Long bare participle phrase containing a subordinate clause; should be 'Yes'.
test_sentence("Worrying that a meteor or chunk of space debris will conk her on the head.")



In [ ]:

    
# Complete sentence ending in the same participle phrase (no final period); should be 'No'.
test_sentence("Aunt Olivia always wears a motorcycle helmet, worrying that a meteor or chunk of space debris will conk her on the head")



In [ ]:

    
# Bare participle phrase; should be 'Yes'.
test_sentence("Affecting the lives of many students in New York City.")



In [ ]:

    
# Complete sentence ending in the same participle phrase; should be 'No'.
test_sentence("Quill was a miracle, affecting the lives of many students in New York City.")



In [ ]:

    
# Two stacked participle phrases and no main clause; should be 'Yes'.
test_sentence("Standing on the edge of the cliff looking down.")



In [ ]:

    
# Complete sentence with the participle phrases in the middle; should be 'No'.
test_sentence("Emilia, standing on the edge of the cliff and looking down, began to weep.")



In [ ]:

    
# Complete sentence with the participle phrases at the start; should be 'No'.
test_sentence("Standing on the edge of the cliff and looking down, Emilia began to weep.")

Save the vocab



In [ ]:

    
# Displays the full vocabulary list as the cell output; nothing is written
# to disk by this cell.
vocab



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]: